The following markdown document contains the majority of code used to generate figures displayed in Figure 1, Supplementary Figure 1 & 2, and Supplementary Table 1. The code is not optimized for markdown viewership, as each plot was individually saved as a pdf.
rm(list = ls())
if (is.integer(dev.list())){dev.off()}
## null device
## 1
cat("\014")
set.seed(1)
source("functions.R")
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.1 ✓ purrr 0.3.4
## ✓ tibble 3.0.1 ✓ dplyr 0.8.5
## ✓ tidyr 1.1.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
## mixtools package, version 1.2.0, Released 2020-02-05
## This package is based upon work supported by the National Science Foundation under Grant No. SES-0518772.
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colMeans,
## colnames, colSums, dirname, do.call, duplicated, eval, evalq,
## Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,
## lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int,
## pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames,
## rowSums, sapply, setdiff, sort, table, tapply, union, unique,
## unsplit, which, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:data.table':
##
## first, second
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:tidyr':
##
## expand
## The following object is masked from 'package:base':
##
## expand.grid
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:data.table':
##
## shift
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:purrr':
##
## reduce
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
##
## select
##
##
## Attaching package: 'gplots'
## The following object is masked from 'package:IRanges':
##
## space
## The following object is masked from 'package:S4Vectors':
##
## space
## The following object is masked from 'package:stats':
##
## lowess
## ========================================
## circlize version 0.4.9
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
##
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
## in R. Bioinformatics 2014.
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(circlize))
## ========================================
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggthemes':
##
## theme_map
## Loading required package: XML
## Loading required package: grid
##
## Attaching package: 'grid'
## The following object is masked from 'package:mixtools':
##
## depth
## ========================================
## ComplexHeatmap version 1.20.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://bioconductor.org/packages/ComplexHeatmap/
##
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
## ========================================
##
## Attaching package: 'ggpubr'
## The following object is masked from 'package:cowplot':
##
## get_legend
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux
##
## Matrix products: default
## BLAS: /software/x86_64/R/3.5.1/lib64/R/lib/libRblas.so
## LAPACK: /software/x86_64/R/3.5.1/lib64/R/lib/libRlapack.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices datasets
## [8] utils methods base
##
## other attached packages:
## [1] ggpubr_0.3.0 ComplexHeatmap_1.20.0 RColorBrewer_1.1-2
## [4] annotate_1.60.1 XML_3.99-0.3 ggbeeswarm_0.6.0
## [7] cowplot_1.0.0 circlize_0.4.9 gplots_3.0.3
## [10] ggthemes_4.2.0 org.Hs.eg.db_3.7.0 AnnotationDbi_1.44.0
## [13] IRanges_2.16.0 S4Vectors_0.20.1 Biobase_2.42.0
## [16] BiocGenerics_0.28.0 biomaRt_2.38.0 PRROC_1.3.1
## [19] permute_0.9-5 mixtools_1.2.0 knitr_1.28
## [22] data.table_1.12.8 forcats_0.5.0 stringr_1.4.0
## [25] dplyr_0.8.5 purrr_0.3.4 readr_1.3.1
## [28] tidyr_1.1.0 tibble_3.0.1 ggplot2_3.3.1
## [31] tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 ggsignif_0.6.0 rjson_0.2.20
## [4] ellipsis_0.3.1 rio_0.5.16 GlobalOptions_0.1.2
## [7] fs_1.4.1 rstudioapi_0.11 bit64_0.9-7
## [10] fansi_0.4.1 lubridate_1.7.9 xml2_1.3.2
## [13] splines_3.5.1 jsonlite_1.6.1 broom_0.5.6
## [16] kernlab_0.9-29 dbplyr_1.4.4 compiler_3.5.1
## [19] httr_1.4.1 backports_1.1.7 assertthat_0.2.1
## [22] Matrix_1.2-18 cli_2.0.2 htmltools_0.4.0
## [25] prettyunits_1.1.1 tools_3.5.1 gtable_0.3.0
## [28] glue_1.4.1 Rcpp_1.0.4.6 carData_3.0-4
## [31] cellranger_1.1.0 vctrs_0.3.1 gdata_2.18.0
## [34] nlme_3.1-148 xfun_0.14 openxlsx_4.1.5
## [37] rvest_0.3.5 lifecycle_0.2.0 renv_0.12.0-3
## [40] gtools_3.8.2 rstatix_0.5.0 MASS_7.3-51.6
## [43] scales_1.1.1 hms_0.5.3 yaml_2.2.1
## [46] curl_4.3 memoise_1.1.0 segmented_1.1-0
## [49] stringi_1.4.6 RSQLite_2.2.0 caTools_1.17.1.2
## [52] zip_2.0.4 shape_1.4.4 rlang_0.4.6
## [55] pkgconfig_2.0.3 bitops_1.0-6 evaluate_0.14
## [58] lattice_0.20-41 bit_1.1-15.2 tidyselect_1.1.0
## [61] magrittr_1.5 R6_2.4.1 generics_0.0.2
## [64] DBI_1.1.0 pillar_1.4.4 haven_2.3.1
## [67] foreign_0.8-70 withr_2.2.0 abind_1.4-5
## [70] survival_3.1-12 RCurl_1.98-1.2 modelr_0.1.8
## [73] crayon_1.3.4 car_3.0-8 KernSmooth_2.23-17
## [76] rmarkdown_2.2 GetoptLong_0.1.8 progress_1.2.2
## [79] readxl_1.3.1 blob_1.2.1 reprex_0.3.0
## [82] digest_0.6.25 xtable_1.8-4 munsell_0.5.0
## [85] beeswarm_0.2.3 vipor_0.4.5
###Load in Data
Avana_Shuffle_Corrected_FC <- read.delim("Data/Avana_Shuffle_Corrected_FC.txt")
cell_line_key <- read_csv("Data/DepMap-2018q4-celllines.csv")
## Parsed with column specification:
## cols(
## DepMap_ID = col_character(),
## CCLE_Name = col_character(),
## Aliases = col_character(),
## COSMIC_ID = col_double(),
## `Sanger ID` = col_double(),
## `Primary Disease` = col_character(),
## `Subtype Disease` = col_character(),
## Gender = col_character(),
## Source = col_character()
## )
## Warning: 2 parsing failures.
## row col expected actual file
## 1329 Source delimiter or quote G 'Data/DepMap-2018q4-celllines.csv'
## 1329 NA 9 columns 15 columns 'Data/DepMap-2018q4-celllines.csv'
###Picking 4 genes noted to behave in pathway, we observed these genes to occur together early on.
key.gns <- c("CDKN1A","TP53","CHEK2","TP53BP1")
tp53_data <- Avana_Shuffle_Corrected_FC %>% filter(GENE %in% key.gns)
tp53_data <- aggregate(tp53_data$MEAN_FC, by=list(tp53_data$Cell_Line),FUN=mean, na.rm=TRUE)
###Taking the cell line demonstrating the highest mean between these genes
tp53_data <- tp53_data %>% filter(x == (max(x))) %>% droplevels() %>% pull(Group.1)
plot_data <- Avana_Shuffle_Corrected_FC %>% filter(Cell_Line %in% tp53_data)
tp53_data <- cell_line_key %>% filter(DepMap_ID == tp53_data) %>% droplevels %>% pull(Aliases) %>% as.character()
###Taking a mixed model of the screen for non-essential/essential display purposes
mixmdl <- normalmixEM(plot_data$MEAN_FC, k = 2)
## number of iterations= 60
plot_labels <- plot_data %>% filter(GENE %in% key.gns)
head(plot_data)
## Cell_Line GENE MEAN_FC MEAN_FC_Shuffle_Mean SD_Shuffle_Mean n n_above
## 1 ACH-001067 A1BG 0.461002 -0.033523 0.554438 8 6
## 2 ACH-001067 A1CF 0.334723 -0.033523 0.554438 8 4
## 3 ACH-001067 A2M 0.081316 -0.033523 0.554438 8 3
## 4 ACH-001067 A2ML1 0.342936 -0.033523 0.554438 8 5
## 5 ACH-001067 A3GALT2 -0.059036 -0.033523 0.554438 8 5
## 6 ACH-001067 A4GALT 0.590651 -0.033523 0.554438 8 5
## percentage
## 1 0.750
## 2 0.500
## 3 0.375
## 4 0.625
## 5 0.625
## 6 0.625
#pdf("./paper_figs_code/fig/1-Distribution_W_Goal_Cell_Cycle.pdf",height = 10,width = 20)
data.frame(x = mixmdl$x) %>%
ggplot() + ggtitle(tp53_data) +
geom_density(aes(plot_data$MEAN_FC), colour = "black",
fill = "lightgray") +
stat_function(geom = "line", fun = plot_mix_comps,
args = list(mixmdl$mu[1], mixmdl$sigma[1], lam = mixmdl$lambda[1]),
colour = "red", lwd = 1,alpha = 0.6) +
stat_function(geom = "line", fun = plot_mix_comps,
args = list(mixmdl$mu[2], mixmdl$sigma[2], lam = mixmdl$lambda[2]),
colour = "blue", lwd = 1,alpha = 0.6) +
ylab("Density") + xlab("Mean Log FC") + theme_Publication() +
###labeling the gene points
geom_text(
data = plot_labels,
aes(x=plot_labels$MEAN_FC, y = 0.05),
label = plot_labels$GENE, hjust = 0,vjust = "inward",
size = 4, angle = 45, colour = "black"
)
## Warning: Use of `plot_labels$MEAN_FC` is discouraged. Use `MEAN_FC` instead.
CCLE_mutations <- read.csv("Data/CCLE_mutations.csv")
###Identifying TP53 and PTEN Mutations
mutations <- CCLE_mutations %>% filter(Hugo_Symbol %in% c("TP53","PTEN")) %>%
filter(Variant_Classification != "Silent") %>% dplyr::select(Hugo_Symbol,DepMap_ID) %>%
unique()
plot_data <- Avana_Shuffle_Corrected_FC %>% filter(GENE == "TP53")
TP53_mut <- mutations %>% filter(Hugo_Symbol == "TP53")
plot_data$`Gene Status` = "None Detected"
plot_data$`Gene Status`[plot_data$Cell_Line %in% TP53_mut$DepMap_ID] = "Mutation"
p1 <- plot_data %>% ggplot(aes(x = `Gene Status`,y = MEAN_FC)) +
geom_beeswarm() +
theme_Publication() +
ggtitle("TP53") +
theme(plot.title = element_text(margin = margin(b = -10)),legend.position = 'none',text = element_text(size=24)) + xlab("") + ylab("Mean Fold-Change") +
stat_compare_means(method = "wilcox",size = 8)
plot_data <- Avana_Shuffle_Corrected_FC %>% filter(GENE == "PTEN")
PTEN_mut <- mutations %>% filter(Hugo_Symbol == "PTEN")
plot_data$`Gene Status` = "None Detected"
plot_data$`Gene Status`[plot_data$Cell_Line %in% PTEN_mut$DepMap_ID] = "Mutation"
p2 <- plot_data %>% ggplot(aes(x = `Gene Status`,y = MEAN_FC)) +
geom_beeswarm() +
theme_Publication() +
ggtitle("PTEN") +
theme(plot.title = element_text(margin = margin(b = -10)),legend.position = 'none',text = element_text(size=24)) +
xlab("") + ylab("Mean Fold-Change") +
stat_compare_means(method = "wilcox",size = 8)
#Demonstrating differences between mutation profiles of PTEN and TP53
plot_grid(p1, p2)
The following code block is some of the initial data manipulation used for the precision, count plot, as well as corresponding box plots observed in figure 1.
###Corrected FC Data
Avana_Shuffle_Corrected_FC <-
read.delim("Data/Avana_Shuffle_Corrected_FC.txt")
AVANA_Q2_19_FC <-
read.delim("Data/AVANA_Q2_19_FC.txt")
cell_line_key <- read_csv("Data/DepMap-2018q4-celllines.csv")
## Parsed with column specification:
## cols(
## DepMap_ID = col_character(),
## CCLE_Name = col_character(),
## Aliases = col_character(),
## COSMIC_ID = col_double(),
## `Sanger ID` = col_double(),
## `Primary Disease` = col_character(),
## `Subtype Disease` = col_character(),
## Gender = col_character(),
## Source = col_character()
## )
## Warning: 2 parsing failures.
## row col expected actual file
## 1329 Source delimiter or quote G 'Data/DepMap-2018q4-celllines.csv'
## 1329 NA 9 columns 15 columns 'Data/DepMap-2018q4-celllines.csv'
key.gns <- c("CDKN1A", "TP53", "CHEK2", "TP53BP1")
tp53_data <-
Avana_Shuffle_Corrected_FC %>% filter(GENE %in% key.gns)
tp53_data <-
aggregate(
tp53_data$MEAN_FC,
by = list(tp53_data$Cell_Line),
FUN = mean,
na.rm = TRUE
)
###identifying F5 again
tp53_data <-
tp53_data %>% filter(x == (max(x))) %>% droplevels() %>% pull(Group.1)
plot_data <-
Avana_Shuffle_Corrected_FC %>% filter(Cell_Line %in% tp53_data)
tp53_name <-
cell_line_key %>% filter(DepMap_ID == tp53_data) %>% droplevels %>% pull(Aliases) %>% as.character()
dat <- merge(Avana_Shuffle_Corrected_FC, AVANA_Q2_19_FC)
rm(AVANA_Q2_19_FC, Avana_Shuffle_Corrected_FC)
cancer_gene_census <- read_csv("Data/cancer_gene_census.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## `Entrez GeneId` = col_double(),
## Tier = col_double()
## )
## See spec(...) for full column specifications.
###Loading in Nonessential genes, COSMIC genes, as well as other methods attempted
NEGv1 <- read.delim("Data/NEGv1.txt")
cosmic_tsg <- cancer_gene_census %>%
filter(grepl("TSG", `Role in Cancer`))
CERES <- read_csv("Data/CERES.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_double(),
## X1 = col_character()
## )
## See spec(...) for full column specifications.
CERES_cols <- CERES$X1
CERES$X1 <- NULL
CERES <- as.data.frame(t(CERES))
CERES$GENE <- rownames(CERES)
CERES_GENEs = str_split_fixed(CERES$GENE, " [(]", 2)
CERES$GENE <- NULL
colnames(CERES) <- CERES_cols
CERES$GENE = as.vector(CERES_GENEs[1:17634,1])
CERES <- gather(CERES,Cell_Line,Score,-GENE) #Positive Scores = NonEssentials
V1 <- as.vector(as.character(CERES$GENE))
entrez_IDS <- mapIds(org.Hs.eg.db, V1, 'ENTREZID', 'SYMBOL')
## 'select()' returned 1:many mapping between keys and columns
CERES$entrez_ID = entrez_IDS
CERES$Cosmic_TSG = 0
CERES$Cosmic_TSG[CERES$entrez_ID %in% cosmic_tsg$`Entrez GeneId`] = 1
CERES$NEG = 0
CERES$NEG[CERES$entrez_ID %in% NEGv1$ENTREZ_ID] = 1
JACKS <-
read_delim("Data/JACKS_result_gene_JACKS_results.txt", "\t")
## Parsed with column specification:
## cols(
## .default = col_double(),
## Gene = col_character()
## )
## See spec(...) for full column specifications.
JACKS <- gather(JACKS, Cell_Line, Score, -Gene)
colnames(JACKS) <-
c("GENE", "Cell_Line", "Score") #Positive Scores = NonEssentials
V1 <- as.vector(as.character(JACKS$GENE))
entrez_IDS <- mapIds(org.Hs.eg.db, V1, 'ENTREZID', 'SYMBOL')
## 'select()' returned 1:many mapping between keys and columns
JACKS$entrez_ID = entrez_IDS
JACKS$Cosmic_TSG = 0
JACKS$Cosmic_TSG[JACKS$entrez_ID %in% cosmic_tsg$`Entrez GeneId`] = 1
JACKS$NEG = 0
JACKS$NEG[JACKS$entrez_ID %in% NEGv1$ENTREZ_ID] = 1
bf_list <-
read.delim("Data/table_Avana2019Q2_CRISPRcleanR_corrected_all")
bf_list <- bf_list %>% gather(Cell_Line, BF, -GENE)
bf_list$Cell_Line = gsub("[.]", "-", bf_list$Cell_Line)
V1 <- as.vector(as.character(bf_list$GENE))
entrez_IDS <- mapIds(org.Hs.eg.db, V1, 'ENTREZID', 'SYMBOL')
## 'select()' returned 1:many mapping between keys and columns
bf_list$entrez_ID = entrez_IDS
bf_list$Cosmic_TSG = 0
bf_list$Cosmic_TSG[bf_list$entrez_ID %in% cosmic_tsg$`Entrez GeneId`] = 1
bf_list$NEG = 0
bf_list$NEG[bf_list$entrez_ID %in% NEGv1$ENTREZ_ID] = 1
V1 <- as.vector(as.character(dat$GENE))
entrez_IDS <- mapIds(org.Hs.eg.db, V1, 'ENTREZID', 'SYMBOL')
## 'select()' returned 1:many mapping between keys and columns
dat$entrez_ID = entrez_IDS
dat$Cosmic_TSG = 0
dat$Cosmic_TSG[dat$entrez_ID %in% cosmic_tsg$`Entrez GeneId`] = 1
dat$NEG = 0
dat$NEG[dat$entrez_ID %in% NEGv1$ENTREZ_ID] = 1
rm(V1, entrez_IDS, CERES_GENEs)
###Calculating corrected modified Z score
dat$Mod_Z_Score = (dat$MEAN_FC - dat$MEAN_FC_Shuffle_Mean) / dat$SD_Shuffle_Mean
###Identifying oncogenes, need to omit TSG observations
oncogenes_all <-
cancer_gene_census %>% filter(`Role in Cancer` %in% c("oncogene, fusion", "oncogene")) %>%
droplevels() %>% pull(`Entrez GeneId`) %>% unique()
###Rank method based on MEAN_FC
dat$ranking <- ave(dat$MEAN_FC, dat$Cell_Line, FUN = rank)
cells_JACKS <- JACKS$Cell_Line %>% unique() %>% as.vector()
dat <- dat %>% filter(Cell_Line %in% cells_JACKS) %>% droplevels()
CERES <-
CERES %>% filter(Cell_Line %in% cells_JACKS) %>% droplevels()
bf_list <-
bf_list %>% filter(Cell_Line %in% cells_JACKS) %>% droplevels()
###TSG All vs Oncogenes + NEG, this is the code use to identify FDR cutoff of 10%
oncogenes_scores <-
dat %>% filter(Cell_Line == as.character(tp53_data)) %>% filter(entrez_ID %in% oncogenes_all) %>% droplevels() %>% pull(Mod_Z_Score)
NEG_scores <-
dat %>% filter(Cell_Line == as.character(tp53_data)) %>% filter(NEG == 1) %>% droplevels() %>% pull(Mod_Z_Score)
tsg_scores <-
dat %>% filter(Cell_Line == as.character(tp53_data)) %>% filter(Cosmic_TSG == 1) %>% droplevels() %>% pull(Mod_Z_Score)
type <- c()
type[1:length(oncogenes_scores)] <- "Oncogenes"
type[length(oncogenes_scores) + 1:length(NEG_scores)] <-
"Non-Essential"
type[length(oncogenes_scores) + length(NEG_scores) + 1:length(tsg_scores)] <-
"Tumor Suppressor"
scores <- c(oncogenes_scores, NEG_scores, tsg_scores)
dat_boxplot <- data.frame(type, scores)
plot_labels <-
dat %>% filter(Cell_Line == as.character(tp53_data)) %>% filter(GENE %in% key.gns) %>% dplyr::select(GENE, Mod_Z_Score)
plot_labels$type = "Tumor Suppressor"
colnames(plot_labels)[2] = "scores"
dat_boxplot <- merge(dat_boxplot, plot_labels, all = TRUE)
my_comparisons <-
list(c("Oncogenes", "Tumor Suppressor"),
c("Non-Essential", "Tumor Suppressor"))
p <-
dat_boxplot %>% ggplot(aes(y = scores, x = type, fill = type)) + geom_boxplot() + theme_Publication() +
theme(axis.text.x = element_text(
angle = 45,
hjust = 1,
size = 18
)) +
theme(legend.position = 'none') + ylab("Shuffled Z-Score") + xlab("Gene Type") +
stat_compare_means(comparisons = my_comparisons) + ggtitle(tp53_name) +
scale_fill_manual(values = c("#999999", "#E69F00", "#699186")) + geom_text(
data = dat_boxplot,
aes(type, scores, label = GENE),
vjust = -.5,
check_overlap = TRUE
)
p$layers[[2]]$aes_params$textsize <- 8
p
## Warning: Removed 1274 rows containing missing values (geom_text).
oncogenes_scores <-
dat %>% filter(entrez_ID %in% oncogenes_all) %>% droplevels() %>% pull(Mod_Z_Score)
NEG_scores <-
dat %>% filter(NEG == 1) %>% droplevels() %>% pull(Mod_Z_Score)
tsg_scores <-
dat %>% filter(Cosmic_TSG == 1) %>% droplevels() %>% pull(Mod_Z_Score)
FP = c(oncogenes_scores, NEG_scores)
pr_mod_Z <-
pr.curve(scores.class0 = tsg_scores ,
scores.class1 = FP,
curve = TRUE)
pr_mod_Z_dat <-
data.frame(
Recall = pr_mod_Z$curve[, 1],
Precision = pr_mod_Z$curve[, 2],
Score = pr_mod_Z$curve[, 3]
)
### rounding for data storage purposes
pr_mod_Z_dat$Precision = round(pr_mod_Z_dat$Precision, digits = 5)
pr_mod_Z_dat$Recall = round(pr_mod_Z_dat$Recall, digits = 5)
pr_mod_Z_dat$Score = round(pr_mod_Z_dat$Score, digits = 2)
pr_mod_Z_dat <- pr_mod_Z_dat %>% unique()
###saving modified pr scores for mod Z
#write_delim(pr_mod_Z_dat,"./Data/Mod_Z_pr_values.txt",delim = "\t")
pr_mod_Z_dat <-
data.frame(
Recall = pr_mod_Z$curve[, 1],
Precision = pr_mod_Z$curve[, 2],
Score = pr_mod_Z$curve[, 3]
)
pr_mod_Z_dat$Precision = round(pr_mod_Z_dat$Precision, digits = 2)
pr_mod_Z_dat$Recall = round(pr_mod_Z_dat$Recall, digits = 2)
pr_mod_Z_dat$Score = round(pr_mod_Z_dat$Score, digits = 2)
pr_mod_Z_dat <- pr_mod_Z_dat %>% unique()
###function is specific for each method value
observation_count <-
sapply(pr_mod_Z_dat$Score, function(x)
length(tsg_scores[tsg_scores > x]))
pr_mod_Z_dat$count <- observation_count
pr_mod_Z_dat <- pr_mod_Z_dat %>% arrange(count)
oncogenes_scores <-
dat %>% filter(entrez_ID %in% oncogenes_all) %>% droplevels() %>% pull(MEAN_FC)
NEG_scores <-
dat %>% filter(NEG == 1) %>% droplevels() %>% pull(MEAN_FC)
tsg_scores <-
dat %>% filter(Cosmic_TSG == 1) %>% droplevels() %>% pull(MEAN_FC)
FP = c(oncogenes_scores, NEG_scores)
pr_MEAN_FC <-
pr.curve(scores.class0 = tsg_scores ,
scores.class1 = FP,
curve = TRUE)
pr_MEAN_FC_dat <-
data.frame(
Recall = pr_MEAN_FC$curve[, 1],
Precision = pr_MEAN_FC$curve[, 2],
Score = pr_MEAN_FC$curve[, 3]
)
pr_MEAN_FC_dat$Precision = round(pr_MEAN_FC_dat$Precision, digits = 2)
pr_MEAN_FC_dat$Recall = round(pr_MEAN_FC_dat$Recall, digits = 2)
pr_MEAN_FC_dat$Score = round(pr_MEAN_FC_dat$Score, digits = 2)
pr_MEAN_FC_dat <- pr_MEAN_FC_dat %>% unique()
observation_count <-
sapply(pr_MEAN_FC_dat$Score, function(x)
length(tsg_scores[tsg_scores > x]))
pr_MEAN_FC_dat$count <- observation_count
pr_MEAN_FC_dat <- pr_MEAN_FC_dat %>% arrange(count)
oncogenes_scores <-
dat %>% filter(Cell_Line == as.character(tp53_data)) %>% filter(entrez_ID %in% oncogenes_all) %>% droplevels() %>% pull(MEAN_FC)
NEG_scores <-
dat %>% filter(Cell_Line == as.character(tp53_data)) %>% filter(NEG == 1) %>% droplevels() %>% pull(MEAN_FC)
tsg_scores <-
dat %>% filter(Cell_Line == as.character(tp53_data)) %>% filter(Cosmic_TSG == 1) %>% droplevels() %>% pull(MEAN_FC)
type <- c()
type[1:length(oncogenes_scores)] <- "Oncogenes"
type[length(oncogenes_scores) + 1:length(NEG_scores)] <-
"Non-Essential"
type[length(oncogenes_scores) + length(NEG_scores) + 1:length(tsg_scores)] <-
"Tumor Suppressor"
scores <- c(oncogenes_scores, NEG_scores, tsg_scores)
dat_boxplot <- data.frame(type, scores)
plot_labels <-
dat %>% filter(Cell_Line == as.character(tp53_data)) %>% filter(GENE %in% key.gns) %>% dplyr::select(GENE, MEAN_FC)
plot_labels$type = "Tumor Suppressor"
colnames(plot_labels)[2] = "scores"
dat_boxplot <- merge(dat_boxplot, plot_labels, all = TRUE)
my_comparisons <-
list(c("Oncogenes", "Tumor Suppressor"),
c("Non-Essential", "Tumor Suppressor"))
p <-
dat_boxplot %>% ggplot(aes(y = scores, x = type, fill = type)) + geom_boxplot() + theme_Publication() +
theme(axis.text.x = element_text(
angle = 45,
hjust = 1,
size = 18
)) +
theme(legend.position = 'none') + ylab("Mean FC") + xlab("Gene Type") +
stat_compare_means(comparisons = my_comparisons) + ggtitle(tp53_name) +
scale_fill_manual(values = c("#999999", "#E69F00", "#699186")) + geom_text(
data = dat_boxplot,
aes(type, scores, label = GENE),
vjust = -.5,
check_overlap = TRUE
)
p$layers[[2]]$aes_params$textsize <- 8
p
## Warning: Removed 1274 rows containing missing values (geom_text).
Box plot not made since it is identical to MEAN FC for single screen
oncogenes_scores <-
dat %>% filter(entrez_ID %in% oncogenes_all) %>% droplevels() %>% pull(ranking)
NEG_scores <-
dat %>% filter(NEG == 1) %>% droplevels() %>% pull(ranking)
tsg_scores <-
dat %>% filter(Cosmic_TSG == 1) %>% droplevels() %>% pull(ranking)
FP = c(oncogenes_scores, NEG_scores)
pr_ranking <-
pr.curve(scores.class0 = tsg_scores ,
scores.class1 = FP,
curve = TRUE)
pr_ranking_dat <-
data.frame(
Recall = pr_ranking$curve[, 1],
Precision = pr_ranking$curve[, 2],
Score = pr_ranking$curve[, 3]
)
pr_ranking_dat$Precision = round(pr_ranking_dat$Precision, digits = 2)
pr_ranking_dat$Recall = round(pr_ranking_dat$Recall, digits = 2)
pr_ranking_dat$Score = round(pr_ranking_dat$Score, digits = 2)
pr_ranking_dat <- pr_ranking_dat %>% unique()
observation_count <-
sapply(pr_ranking_dat$Score, function(x)
length(tsg_scores[tsg_scores > x]))
pr_ranking_dat$count <- observation_count
pr_ranking_dat <- pr_ranking_dat %>% arrange(count)
oncogenes_scores <-
JACKS %>% filter(entrez_ID %in% oncogenes_all) %>% droplevels() %>% pull(Score)
NEG_scores <-
JACKS %>% filter(NEG == 1) %>% droplevels() %>% pull(Score)
tsg_scores <-
JACKS %>% filter(Cosmic_TSG == 1) %>% droplevels() %>% pull(Score)
FP = c(oncogenes_scores, NEG_scores)
pr_JACKS <-
pr.curve(scores.class0 = tsg_scores ,
scores.class1 = FP,
curve = TRUE)
pr_JACKS_dat <-
data.frame(
Recall = pr_JACKS$curve[, 1],
Precision = pr_JACKS$curve[, 2],
Score = pr_JACKS$curve[, 3]
)
pr_JACKS_dat$Precision = round(pr_JACKS_dat$Precision, digits = 2)
pr_JACKS_dat$Recall = round(pr_JACKS_dat$Recall, digits = 2)
pr_JACKS_dat$Score = round(pr_JACKS_dat$Score, digits = 2)
pr_JACKS_dat <- pr_JACKS_dat %>% unique()
observation_count <-
sapply(pr_JACKS_dat$Score, function(x)
length(tsg_scores[tsg_scores > x]))
pr_JACKS_dat$count <- observation_count
pr_JACKS_dat <- pr_JACKS_dat %>% arrange(count)
oncogenes_scores <-
JACKS %>% filter(Cell_Line == tp53_data) %>% filter(entrez_ID %in% oncogenes_all) %>% droplevels() %>% pull(Score)
NEG_scores <-
JACKS %>% filter(Cell_Line == tp53_data) %>% filter(NEG == 1) %>% droplevels() %>% pull(Score)
tsg_scores <-
JACKS %>% filter(Cell_Line == tp53_data) %>% filter(Cosmic_TSG == 1) %>% droplevels() %>% pull(Score)
type <- c()
type[1:length(oncogenes_scores)] <- "Oncogenes"
type[length(oncogenes_scores) + 1:length(NEG_scores)] <-
"Non-Essential"
type[length(oncogenes_scores) + length(NEG_scores) + 1:length(tsg_scores)] <-
"Tumor Suppressor"
scores <- c(oncogenes_scores, NEG_scores, tsg_scores)
dat_boxplot <- data.frame(type, scores)
plot_labels <-
dat %>% filter(Cell_Line == as.character(tp53_data)) %>% filter(GENE %in% key.gns) %>% dplyr::select(GENE, Mod_Z_Score)
plot_labels$type = "Tumor Suppressor"
colnames(plot_labels)[2] = "scores"
dat_boxplot <- merge(dat_boxplot, plot_labels, all = TRUE)
my_comparisons <-
list(c("Oncogenes", "Tumor Suppressor"),
c("Non-Essential", "Tumor Suppressor"))
p <-
dat_boxplot %>% ggplot(aes(y = scores, x = type, fill = type)) + geom_boxplot() + theme_Publication() +
theme(axis.text.x = element_text(
angle = 45,
hjust = 1,
size = 18
)) +
theme(legend.position = 'none') + ylab("JACKS Score") + xlab("Gene Type") +
stat_compare_means(comparisons = my_comparisons) + ggtitle(tp53_name) +
scale_fill_manual(values = c("#999999", "#E69F00", "#699186")) + geom_text(
data = dat_boxplot,
aes(type, scores, label = GENE),
vjust = -.5,
check_overlap = TRUE
)
p$layers[[2]]$aes_params$textsize <- 8
p
## Warning: Removed 1282 rows containing missing values (geom_text).
oncogenes_scores <-
CERES %>% filter(entrez_ID %in% oncogenes_all) %>% droplevels() %>% pull(Score)
NEG_scores <-
CERES %>% filter(NEG == 1) %>% droplevels() %>% pull(Score)
tsg_scores <-
CERES %>% filter(Cosmic_TSG == 1) %>% droplevels() %>% pull(Score)
FP = c(oncogenes_scores, NEG_scores)
pr_ceres <-
pr.curve(scores.class0 = tsg_scores ,
scores.class1 = FP,
curve = TRUE)
pr_ceres_dat <-
data.frame(
Recall = pr_ceres$curve[, 1],
Precision = pr_ceres$curve[, 2],
Score = pr_ceres$curve[, 3]
)
pr_ceres_dat$Precision = round(pr_ceres_dat$Precision, digits = 2)
pr_ceres_dat$Recall = round(pr_ceres_dat$Recall, digits = 2)
pr_ceres_dat$Score = round(pr_ceres_dat$Score, digits = 2)
pr_ceres_dat <- pr_ceres_dat %>% unique()
observation_count <-
sapply(pr_ceres_dat$Score, function(x)
length(tsg_scores[tsg_scores > x]))
pr_ceres_dat$count <- observation_count
pr_ceres_dat <- pr_ceres_dat %>% arrange(count)
oncogenes_scores <-
CERES %>% filter(Cell_Line == tp53_data) %>% filter(entrez_ID %in% oncogenes_all) %>% droplevels() %>% pull(Score)
NEG_scores <-
CERES %>% filter(Cell_Line == tp53_data) %>% filter(NEG == 1) %>% droplevels() %>% pull(Score)
tsg_scores <-
CERES %>% filter(Cell_Line == tp53_data) %>% filter(Cosmic_TSG == 1) %>% droplevels() %>% pull(Score)
type <- c()
type[1:length(oncogenes_scores)] <- "Oncogenes"
type[length(oncogenes_scores) + 1:length(NEG_scores)] <-
"Non-Essential"
type[length(oncogenes_scores) + length(NEG_scores) + 1:length(tsg_scores)] <-
"Tumor Suppressor"
scores <- c(oncogenes_scores, NEG_scores, tsg_scores)
plot_labels <-
CERES %>% filter(Cell_Line == tp53_data) %>% filter(GENE %in% key.gns) %>% dplyr::select(GENE, Score)
plot_labels$type = "Tumor Suppressor"
colnames(plot_labels)[2] = "scores"
scores <- c(oncogenes_scores, NEG_scores, tsg_scores)
dat_boxplot <- data.frame(type, scores)
dat_boxplot <- merge(dat_boxplot, plot_labels, all = TRUE)
my_comparisons <-
list(c("Oncogenes", "Tumor Suppressor"),
c("Non-Essential", "Tumor Suppressor"))
p <-
dat_boxplot %>% ggplot(aes(y = scores, x = type, fill = type)) + geom_boxplot() + theme_Publication() +
theme(axis.text.x = element_text(
angle = 45,
hjust = 1,
size = 18
)) +
theme(legend.position = 'none') + ylab("CERES Score") + xlab("Gene Type") +
stat_compare_means(comparisons = my_comparisons) + ggtitle(tp53_name) +
scale_fill_manual(values = c("#999999", "#E69F00", "#699186")) + geom_text(
data = dat_boxplot,
aes(type, scores, label = GENE),
vjust = -.5,
check_overlap = TRUE
)
p$layers[[2]]$aes_params$textsize <- 8
p
## Warning: Removed 1277 rows containing missing values (geom_text).
oncogenes_scores <-
bf_list %>% filter(entrez_ID %in% oncogenes_all) %>% droplevels() %>% pull(BF)
NEG_scores <-
bf_list %>% filter(NEG == 1) %>% droplevels() %>% pull(BF)
tsg_scores <-
bf_list %>% filter(Cosmic_TSG == 1) %>% droplevels() %>% pull(BF)
FP = c(oncogenes_scores, NEG_scores)
pr_bagel <-
pr.curve(
scores.class0 = -tsg_scores ,
scores.class1 = -FP,
curve = TRUE
)
pr_bagel_dat <-
data.frame(
Recall = pr_bagel$curve[, 1],
Precision = pr_bagel$curve[, 2],
Score = pr_bagel$curve[, 3]
)
tsg_scores <- -tsg_scores
pr_bagel_dat$Precision = round(pr_bagel_dat$Precision, digits = 2)
pr_bagel_dat$Recall = round(pr_bagel_dat$Recall, digits = 2)
pr_bagel_dat$Score = round(pr_bagel_dat$Score, digits = 2)
pr_bagel_dat <- pr_bagel_dat %>% unique()
observation_count <-
sapply(pr_bagel_dat$Score, function(x)
length(tsg_scores[tsg_scores > x]))
pr_bagel_dat$count <- observation_count
pr_bagel_dat <- pr_bagel_dat %>% arrange(count)
oncogenes_scores <-
bf_list %>% filter(Cell_Line == tp53_data) %>% filter(entrez_ID %in% oncogenes_all) %>% droplevels() %>% pull(BF)
NEG_scores <-
bf_list %>% filter(Cell_Line == tp53_data) %>% filter(NEG == 1) %>% droplevels() %>% pull(BF)
tsg_scores <-
bf_list %>% filter(Cell_Line == tp53_data) %>% filter(Cosmic_TSG == 1) %>% droplevels() %>% pull(BF)
type <- c()
type[1:length(oncogenes_scores)] <- "Oncogenes"
type[length(oncogenes_scores) + 1:length(NEG_scores)] <-
"Non-Essential"
type[length(oncogenes_scores) + length(NEG_scores) + 1:length(tsg_scores)] <-
"Tumor Suppressor"
plot_labels <-
bf_list %>% filter(Cell_Line == tp53_data) %>% filter(GENE %in% key.gns) %>% dplyr::select(GENE, BF)
plot_labels$type = "Tumor Suppressor"
colnames(plot_labels)[2] = "scores"
scores <- c(oncogenes_scores, NEG_scores, tsg_scores)
dat_boxplot <- data.frame(type, scores)
dat_boxplot <- merge(dat_boxplot, plot_labels, all = TRUE)
my_comparisons <-
list(c("Oncogenes", "Tumor Suppressor"),
c("Non-Essential", "Tumor Suppressor"))
p <-
dat_boxplot %>% ggplot(aes(y = scores, x = type, fill = type)) + geom_boxplot() + theme_Publication() +
theme(axis.text.x = element_text(
angle = 45,
hjust = 1,
size = 18
)) +
theme(legend.position = 'none') + ylab("Bagel BF") + xlab("Gene Type") +
stat_compare_means(comparisons = my_comparisons) + ggtitle(tp53_name) +
scale_fill_manual(values = c("#999999", "#E69F00", "#699186")) + geom_text(
data = dat_boxplot,
aes(type, scores, label = GENE),
vjust = -.5,
check_overlap = TRUE
)
p$layers[[2]]$aes_params$textsize <- 8
p
## Warning: Removed 1274 rows containing missing values (geom_text).
# pr_ceres_dat
# pr_bagel_dat
# pr_JACKS_dat
# pr_mod_Z_dat
# pr_ranking_dat
# pr_MEAN_FC_dat
pr_mod_Z_dat <- as.data.table(pr_mod_Z_dat)
pr_mod_Z_dat_plot <-
as.data.frame(pr_mod_Z_dat[, .SD[which.max(count)], by = Precision]) %>% fix_count()
pr_bagel_dat <- as.data.table(pr_bagel_dat)
pr_bagel_dat_plot <-
as.data.frame(pr_bagel_dat[, .SD[which.max(count)], by = Precision]) %>% fix_count()
pr_ceres_dat <- as.data.table(pr_ceres_dat)
pr_ceres_dat_plot <-
as.data.frame(pr_ceres_dat[, .SD[which.max(count)], by = Precision]) %>% fix_count()
pr_JACKS_dat <- as.data.table(pr_JACKS_dat)
pr_JACKS_dat_plot <-
as.data.frame(pr_JACKS_dat[, .SD[which.max(count)], by = Precision]) %>% fix_count()
pr_MEAN_FC_dat <- as.data.table(pr_MEAN_FC_dat)
pr_MEAN_FC_dat_plot <-
as.data.frame(pr_MEAN_FC_dat[, .SD[which.max(Score)], by = Precision]) %>% fix_count()
pr_ranking_dat <- as.data.table(pr_ranking_dat)
pr_ranking_dat_plot <-
as.data.frame(pr_ranking_dat[, .SD[which.max(Score)], by = Precision]) %>% fix_count()
ggplot() + ylim(c(0.8, 1)) + xlim(c(0, 1000)) + theme_Publication() + ylab("Precision") + xlab("Count") +
geom_line(
aes(
y = pr_mod_Z_dat_plot$Precision,
x = pr_mod_Z_dat_plot$count,
color = "Shuffled Z-Score"
),
alpha = 0.8,
color = "red"
) +
geom_line(
aes(
y = pr_bagel_dat_plot$Precision,
x = pr_bagel_dat_plot$count,
color = "BAGEL"
),
alpha = 0.8,
color = "green"
) +
geom_line(
aes(
y = pr_ceres_dat_plot$Precision,
x = pr_ceres_dat_plot$count,
color = "CERES"
),
alpha = 0.8,
color = "blue"
) +
geom_line(
aes(
y = pr_ranking_dat_plot$Precision,
x = pr_ranking_dat_plot$count,
color = "Ranking"
),
alpha = 0.8,
color = "violet"
) +
geom_line(
aes(
y = pr_MEAN_FC_dat_plot$Precision,
x = pr_MEAN_FC_dat_plot$count,
color = "Mean FC"
),
alpha = 0.8,
color = "gray"
) +
geom_line(
aes(
y = pr_JACKS_dat_plot$Precision,
x = pr_JACKS_dat_plot$count,
color = "JACKS"
),
alpha = 0.8,
color = "yellow"
) +
theme(legend.position = "right") +
geom_hline(yintercept = 0.9,
color = "black",
linetype = 3)
## Warning: Removed 57 row(s) containing missing values (geom_path).
## Warning: Removed 57 row(s) containing missing values (geom_path).
## Warning: Removed 58 row(s) containing missing values (geom_path).
## Warning: Removed 61 row(s) containing missing values (geom_path).
## Warning: Removed 61 row(s) containing missing values (geom_path).
## Warning: Removed 58 row(s) containing missing values (geom_path).
###Demonstrating the shuffled fold change strategy
rm(list = ls())
if (is.integer(dev.list())){dev.off()}
## null device
## 1
cat("\014")
set.seed(1)
source("functions.R")
cell_line_key <- read_csv("Data/DepMap-2018q4-celllines.csv")
## Parsed with column specification:
## cols(
## DepMap_ID = col_character(),
## CCLE_Name = col_character(),
## Aliases = col_character(),
## COSMIC_ID = col_double(),
## `Sanger ID` = col_double(),
## `Primary Disease` = col_character(),
## `Subtype Disease` = col_character(),
## Gender = col_character(),
## Source = col_character()
## )
## Warning: 2 parsing failures.
## row col expected actual file
## 1329 Source delimiter or quote G 'Data/DepMap-2018q4-celllines.csv'
## 1329 NA 9 columns 15 columns 'Data/DepMap-2018q4-celllines.csv'
cancer_gene_census <- read_csv("Data/cancer_gene_census.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## `Entrez GeneId` = col_double(),
## Tier = col_double()
## )
## See spec(...) for full column specifications.
cosmic_tsg <- cancer_gene_census %>%
filter(grepl("TSG",`Role in Cancer` ))
###arranging fold_dat together
fold_dat <- read_delim("Data/Avana_FC_Comb_FC_Corrected", "\t", escape_double = FALSE, trim_ws = TRUE)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_double(),
## X1 = col_character(),
## GENE = col_character()
## )
## See spec(...) for full column specifications.
replicate_map <- read_delim("Data/replicate_map.csv", ",", escape_double = FALSE, trim_ws = TRUE)
## Parsed with column specification:
## cols(
## replicate_ID = col_character(),
## DepMap_ID = col_character(),
## pDNA_batch = col_double()
## )
replicate_map <- replicate_map %>% dplyr::select(-pDNA_batch)
fold_dat <- gather(fold_dat,replicate_ID,FC,-c(X1,GENE))
replicate_map_change <- replicate_map
replicate_map_change$replicate_ID <- gsub(' ', '_', replicate_map_change$replicate_ID)
replicate_map_change$replicate_ID <- gsub('/', '_', replicate_map_change$replicate_ID)
replicate_map_change$replicate_ID <- gsub(',', '_', replicate_map_change$replicate_ID)
fold_dat_found <- fold_dat %>% filter(replicate_ID %in% replicate_map$replicate_ID)
fold_dat_miss <- fold_dat %>% filter(replicate_ID %!in% replicate_map$replicate_ID)
#unique(fold_dat_miss$replicate_ID[fold_dat_miss$replicate_ID %!in% replicate_map_change$replicate_ID])
fold_dat_found <- left_join(fold_dat_found,replicate_map,by = "replicate_ID")
fold_dat_miss <- left_join(fold_dat_miss,replicate_map_change,by = "replicate_ID")
fold_dat <- rbind(fold_dat_found,fold_dat_miss)
rm(fold_dat_found,fold_dat_miss)
cells <- sample(unique(fold_dat$DepMap_ID),size = 1) %>% as.vector()
fold_dat <- fold_dat %>% filter(DepMap_ID %in% cells)
temp <- fold_dat
V1 <- as.vector(as.character(temp$GENE))
entrez_IDS <- mapIds(org.Hs.eg.db, V1, 'ENTREZID', 'SYMBOL')
## 'select()' returned 1:many mapping between keys and columns
temp$entrez_ID = entrez_IDS
rm(entrez_IDS,V1)
fold_dat_temp <- aggregate(fold_dat$FC, by=list(fold_dat$GENE,fold_dat$DepMap_ID),FUN=mean, na.rm=TRUE)
colnames(fold_dat_temp) = c("GENE","Cell_Line","Mean_FC")
fold_dat_temp$Type = "Original"
###sampling a cell type
cell <- sample(unique(fold_dat$DepMap_ID),size = 1) %>% as.vector()
fold_dat_shuffle <- fold_dat %>% filter(DepMap_ID == cell)
fold_dat_shuffle$FC <- ave(fold_dat_shuffle$FC, fold_dat_shuffle$DepMap_ID, FUN = sample)
fold_dat_shuffle <- aggregate(fold_dat_shuffle$FC, by=list(fold_dat_shuffle$GENE,fold_dat_shuffle$DepMap_ID),FUN=mean, na.rm=TRUE)
colnames(fold_dat_shuffle) = c("GENE","Cell_Line","Mean_FC")
fold_dat_shuffle$Type = "Shuffled"
fold_dat_temp <- rbind(fold_dat_temp,fold_dat_shuffle)
fold_dat_temp$Key = paste(fold_dat_temp$Cell_Line,fold_dat_temp$Type)
fold_dat_temp %>% ggplot(aes(x = Mean_FC , group = Key, fill = Type)) +
geom_density(alpha = 0.8) + theme_Publication() +
scale_fill_manual(
values = c("#d8dee6","#bf3737")
) + theme(legend.position = 'none') + xlab("Mean Log(Fold-Change)") +ylab("Density")
rm(list = ls())
if (is.integer(dev.list())){dev.off()}
## null device
## 1
cat("\014")
set.seed(1)
source("functions.R")
Avana_Shuffle_Corrected_FC <- read.delim("Data/Avana_Shuffle_Corrected_FC.txt")
cell_line_key <- read_csv("Data/DepMap-2018q4-celllines.csv")
## Parsed with column specification:
## cols(
## DepMap_ID = col_character(),
## CCLE_Name = col_character(),
## Aliases = col_character(),
## COSMIC_ID = col_double(),
## `Sanger ID` = col_double(),
## `Primary Disease` = col_character(),
## `Subtype Disease` = col_character(),
## Gender = col_character(),
## Source = col_character()
## )
## Warning: 2 parsing failures.
## row col expected actual file
## 1329 Source delimiter or quote G 'Data/DepMap-2018q4-celllines.csv'
## 1329 NA 9 columns 15 columns 'Data/DepMap-2018q4-celllines.csv'
V1 <- as.vector(as.character(Avana_Shuffle_Corrected_FC$GENE))
entrez_IDS <- mapIds(org.Hs.eg.db, V1, 'ENTREZID', 'SYMBOL')
## 'select()' returned 1:many mapping between keys and columns
Avana_Shuffle_Corrected_FC$entrez_ID = entrez_IDS
rm(entrez_IDS,V1)
try <- aggregate(Avana_Shuffle_Corrected_FC$MEAN_FC, by= list(Avana_Shuffle_Corrected_FC$Cell_Line),mean)
try2 <- aggregate(Avana_Shuffle_Corrected_FC$MEAN_FC, by= list(Avana_Shuffle_Corrected_FC$Cell_Line),sd)
colnames(try) = c("Cell_Line","Mean_FC_Orig")
colnames(try2) = c("Cell_Line","Mean_FC_SD")
temp <- merge(try,try2)
try <- Avana_Shuffle_Corrected_FC %>% dplyr::select(Cell_Line,MEAN_FC_Shuffle_Mean,SD_Shuffle_Mean) %>% unique()
temp <- merge(temp,try)
a <- temp %>% ggplot() + geom_beeswarm(aes(x = "Original", y = Mean_FC_Orig)) + ylab("Mean FC Per Screen") +
geom_beeswarm(aes(x = "Shuffled",y = MEAN_FC_Shuffle_Mean)) + theme_Publication() + xlab("")
print(a)
b <- temp %>% ggplot() + geom_beeswarm(aes(x = "Original", y = Mean_FC_SD )) +
geom_beeswarm(aes(x = "Shuffled",y = SD_Shuffle_Mean)) +
theme_Publication() + xlab("") + ylab("Standard Deviation Per Screen" )
print(b)
wilcox.test(temp$Mean_FC_Orig,temp$MEAN_FC_Shuffle_Mean)
##
## Wilcoxon rank sum test with continuity correction
##
## data: temp$Mean_FC_Orig and temp$MEAN_FC_Shuffle_Mean
## W = 162860, p-value = 0.423
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(temp$Mean_FC_SD,temp$SD_Shuffle_Mean)
##
## Wilcoxon rank sum test with continuity correction
##
## data: temp$Mean_FC_SD and temp$SD_Shuffle_Mean
## W = 274430, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
#plot_grid(a, b, labels = c('Mean FC', 'FC Standard Deviation'), label_size = 20)
rm(list = ls())
if (is.integer(dev.list())){dev.off()}
## null device
## 1
cat("\014")
set.seed(1)
source("functions.R")
avana_res <- read.delim("Data/avana_output_v2.txt")
cell_line_key <- read_csv("Data/DepMap-2018q4-celllines.csv")
## Parsed with column specification:
## cols(
## DepMap_ID = col_character(),
## CCLE_Name = col_character(),
## Aliases = col_character(),
## COSMIC_ID = col_double(),
## `Sanger ID` = col_double(),
## `Primary Disease` = col_character(),
## `Subtype Disease` = col_character(),
## Gender = col_character(),
## Source = col_character()
## )
## Warning: 2 parsing failures.
## row col expected actual file
## 1329 Source delimiter or quote G 'Data/DepMap-2018q4-celllines.csv'
## 1329 NA 9 columns 15 columns 'Data/DepMap-2018q4-celllines.csv'
cancer_gene_census <- read_csv("Data/cancer_gene_census.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## `Entrez GeneId` = col_double(),
## Tier = col_double()
## )
## See spec(...) for full column specifications.
temp_hits <- avana_res %>% filter(hit == 1) %>% droplevels()
cosmic_tsg <- cancer_gene_census %>%
filter(grepl("TSG",`Role in Cancer` ))
cosmic_tsg <- cancer_gene_census %>%
filter(grepl("TSG", `Role in Cancer`))
num_cells <- length(unique(avana_res$Cell_Line))
temp_cosmic <- temp_hits %>% filter(Cosmic_TSG == 1)
tab <- table(temp_cosmic$GENE)
tab <- tab %>% sort(decreasing = TRUE)
tab <- tab[1:20]
temp <- data.frame(GENE = names(tab), Count = tab)
#####Plotting number of hits
temp %>% ggplot(aes(
x = reorder(Count.Var1, -Count.Freq),
y = Count.Freq,
fill = "#699186"
)) +
geom_bar(stat = "identity", color = '#699186') +
scale_fill_manual("Key", values = c("#469c83")) +
xlab("") +
ylab("Count") +
theme_Publication() +
theme(axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 18
)) +
theme(legend.position = 'none')
temp$Count.Freq = 100 * temp$Count.Freq / num_cells
#####Plotting percentages
temp %>% ggplot(aes(
x = reorder(Count.Var1, -Count.Freq),
y = Count.Freq,
fill = "#699186"
)) +
geom_bar(stat = "identity", color = '#699186') +
scale_fill_manual("Key", values = c("#469c83")) +
xlab("") +
ylab("Percent of Cell Lines") +
theme_Publication() +
theme(axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 18
)) +
theme(legend.position = 'none')
################COSMIC Defined Tumor Suppressor Signature in Genetic Screens.
###Identifying COSMIC TSGs and creating table with general statistics
cosmic_hits <-
avana_res %>% filter(entrez_ID %in% cosmic_tsg$`Entrez GeneId`) %>% droplevels() %>% filter(hit == 1)
cosmic_hits$gene_cell = paste(cosmic_hits$GENE, cosmic_hits$Cell_Line)
cosmic_data <-
avana_res %>% filter(entrez_ID %in% cosmic_hits$entrez_ID) %>% droplevels()
cosmic_data$gene_cell = paste(cosmic_data$GENE, cosmic_data$Cell_Line)
cosmic_data <-
cosmic_data %>% filter(gene_cell %!in% cosmic_hits$gene_cell)
avana_hits <- avana_res %>% filter(hit == 1) %>% droplevels()
cosmic_hits <-
avana_res %>% filter(entrez_ID %in% cosmic_tsg$`Entrez GeneId`) %>% droplevels() %>% filter(hit == 1)
try <- cosmic_hits %>% droplevels()
#unique(try$Cell_Line) # 249
cosmic_hit_genes <- unique(cosmic_hits$GENE)
cosmic_hit_table <-
avana_res %>% filter(GENE %in% cosmic_hit_genes) %>% droplevels()
cosmic_hit_table_1 <- cosmic_hit_table %>% filter(hit == 1)
cosmic_hit_table_0 <- cosmic_hit_table %>% filter(hit == 0)
genes <- c()
wilcox_tests <- c()
fishers_test <- c()
for (gene in unique(cosmic_hit_table$GENE)){
genes <- c(genes,gene)
temp_1 <- cosmic_hit_table_1 %>% filter(GENE == gene)
temp_0 <- cosmic_hit_table_0 %>% filter(GENE == gene)
x <- temp_1 %>% pull(Exp)
y <- temp_0 %>% pull(Exp)
if (!is.na(x[1])){
w_test <- wilcox.test(x,y,alternative = "greater")
w_test <- w_test$p.value
}
if (is.na(x[1])){
w_test <- "N/A"
}
wilcox_tests <- c(wilcox_tests,w_test)
hit_no_mut <- temp_1 %>% filter(mut == 0) %>% nrow()
hit_mut <- temp_1 %>% filter(mut == 1) %>% nrow()
non_hit_no_mut <- temp_0 %>% filter(mut == 0) %>% nrow()
non_hit_mut <- temp_0 %>% filter(mut == 1) %>% nrow()
mutation_table <- matrix(c(hit_mut, non_hit_no_mut,hit_no_mut, non_hit_mut),nrow = 2,dimnames = list(PS_Status = c("PS_Hit", "PS_Non_Hit"), Mut_Status = c("Mut", "No_Mut")))
f_test <- fisher.test(mutation_table, alternative = "less")
fishers_test <- c(fishers_test,f_test$p.value)
}
rm(f_test,w_test,hit_mut,hit_no_mut,non_hit_mut,non_hit_no_mut,x,y,temp_0,temp_1)
hit_test <- tibble(genes,wilcox_tests,fishers_test)
hit_test$wilcox_tests <- as.numeric(hit_test$wilcox_tests)
## Warning: NAs introduced by coercion
#hit_test$wilcox_tests <- round(hit_test$wilcox_tests,digits = 5)
hit_test$fishers_test <- as.numeric(hit_test$fishers_test)
#hit_test$fishers_test <- round(hit_test$fishers_test,digits = 5)
##### Aggregating the expression and mutational rates
cosmic_hit_table_1_count <-
aggregate(
cosmic_hit_table_1$hit,
by = list(cosmic_hit_table_1$GENE),
FUN = sum,
na.rm = TRUE
)
cosmic_hit_table_1_exp_mean <-
aggregate(
cosmic_hit_table_1$Exp,
by = list(cosmic_hit_table_1$GENE),
FUN = mean,
na.rm = TRUE
)
cosmic_hit_table_1_exp_median <-
aggregate(
cosmic_hit_table_1$Exp,
by = list(cosmic_hit_table_1$GENE),
FUN = median,
na.rm = TRUE
)
cosmic_hit_table_1_mut <-
aggregate(
cosmic_hit_table_1$mut,
by = list(cosmic_hit_table_1$GENE),
FUN = mean,
na.rm = TRUE
)
cosmic_hit_table_0_exp_mean <-
aggregate(
cosmic_hit_table_0$Exp,
by = list(cosmic_hit_table_0$GENE),
FUN = mean,
na.rm = TRUE
)
cosmic_hit_table_0_exp_median <-
aggregate(
cosmic_hit_table_0$Exp,
by = list(cosmic_hit_table_0$GENE),
FUN = median,
na.rm = TRUE
)
cosmic_hit_table_0_mut <-
aggregate(
cosmic_hit_table_0$mut,
by = list(cosmic_hit_table_0$GENE),
FUN = mean,
na.rm = TRUE
)
colnames(cosmic_hit_table_1_count) = c("GENE", "Count")
colnames(cosmic_hit_table_1_exp_mean) = c("GENE", "PS_Mean_Exp")
colnames(cosmic_hit_table_1_mut) = c("GENE", "PS_Mut")
colnames(cosmic_hit_table_0_exp_mean) = c("GENE", "Other_Mean_Exp")
colnames(cosmic_hit_table_0_mut) = c("GENE", "Other_Mut")
colnames(cosmic_hit_table_0_exp_median) = c("GENE", "Other_Median_Exp")
colnames(cosmic_hit_table_1_exp_median) = c("GENE", "PS_Median_Exp")
cosmic_hit_table <-
do.call(
"cbind",
list(
cosmic_hit_table_1_count,
cosmic_hit_table_1_exp_mean,
cosmic_hit_table_0_exp_mean,
cosmic_hit_table_1_exp_median,
cosmic_hit_table_0_exp_median,
cosmic_hit_table_1_mut,
cosmic_hit_table_0_mut
)
)
cosmic_hit_table <- cosmic_hit_table[, !duplicated(colnames(cosmic_hit_table))]
head(cosmic_hit_table)
## GENE Count PS_Mean_Exp Other_Mean_Exp PS_Median_Exp Other_Median_Exp PS_Mut
## 1 ARID1A 2 5.731450 4.846014 5.731450 4.906650 0
## 2 ARNT 9 4.426237 4.432091 4.344830 4.405990 0
## 3 ATM 2 4.278285 4.262823 4.278285 4.261155 0
## 4 AXIN1 4 4.082027 4.097776 4.030750 4.112700 0
## 5 BAX 4 7.033775 7.259463 6.973220 7.296315 0
## 6 CASP8 1 4.938760 4.342033 4.938760 4.517280 0
## Other_Mut
## 1 0.16577540
## 2 0.02527076
## 3 0.16934046
## 4 0.03756708
## 5 0.01431127
## 6 0.04982206
colnames(hit_test) <- c("GENE","Expression_Compare_Wilcox_Test_p","Mutations_Compare_Fisher_Test_p")
cosmic_hit_table <- merge(cosmic_hit_table,hit_test)
#write_delim(format(cosmic_hit_table, digits=4), "./Data/Supplementary_Table_1.txt", delim = "\t")
Plotting density plot difference between Expression and mutation rates
cosmic_hit_table$Diff_Exp = cosmic_hit_table$PS_Mean_Exp - cosmic_hit_table$Other_Mean_Exp
cosmic_hit_table$Diff_mut = cosmic_hit_table$PS_Mut - cosmic_hit_table$Other_Mut
cosmic_hit_table %>% ggplot(aes(x = Diff_Exp)) + geom_density() + geom_vline(xintercept = 0,color = "red") +
theme_Publication() + xlab("Δ Mean Gene Expression \n Mean PS - Mean Non PS")
## Warning: Removed 1 rows containing non-finite values (stat_density).
cosmic_hit_table %>% ggplot(aes(x = Diff_mut)) + geom_density() + geom_vline(xintercept = 0,color = "red") +
theme_Publication() + xlab("Δ Mean Mutation Rate \n Mean PS - Mean Non PS")
Plotting boxplot difference between Expression and mutation rates
temp1 <- cosmic_hit_table %>% dplyr::select(PS_Mean_Exp)
temp1$Type <- "PS"
colnames(temp1)[1] <- "Gene_Exp"
temp2 <- cosmic_hit_table %>% dplyr::select(Other_Mean_Exp)
temp2$Type <- "Other"
colnames(temp2)[1] <- "Gene_Exp"
plot_df <- rbind(temp1,temp2)
plot_df %>% ggplot(aes(y = Gene_Exp, x = Type)) + geom_beeswarm() + theme_Publication()+ ylab("Mean Expression") + stat_compare_means(method = "wilcox") +
stat_summary(fun = median, color = "red",
geom = "point")
## Warning: Removed 2 rows containing non-finite values (stat_compare_means).
## Warning: Removed 2 rows containing non-finite values (stat_summary).
## Warning: Removed 2 rows containing missing values (position_beeswarm).
temp1 <- cosmic_hit_table %>% dplyr::select(PS_Mut)
temp1$Type <- "PS"
colnames(temp1)[1] <- "Gene_Mut"
temp2 <- cosmic_hit_table %>% dplyr::select(Other_Mut)
temp2$Type <- "Other"
colnames(temp2)[1] <- "Gene_Mut"
plot_df <- rbind(temp1,temp2)
plot_df %>% ggplot(aes(y = Gene_Mut, x = Type)) + geom_beeswarm() + theme_Publication()+ ylab("Mean Mutation Rate") + stat_compare_means(method = "wilcox")+
stat_summary(fun = median, color = "red",
geom = "point")
At some point R plotting code was broken for 2A & 2B. X Axis needed to be changed to factor instead of numeric. As such, the X intercept in this code is slight shifted to 5.25, as opposed to 5.24. X intercept at 5.24 used for manuscript. Data remains the same.
######## Shuffled_FC TSG Rep
###Cleaning variables from above
rm(
cosmic_hit_genes,
cosmic_hit_table_1_exp_mean,
cosmic_hit_table_0_exp_median,
cosmic_hit_table_0_exp_mean,
cosmic_hit_table_1_exp_median,
cosmic_hit_table,
cosmic_hit_table_0,
cosmic_hit_table_0_exp,
cosmic_hit_table_0_mut,
cosmic_hit_table_1,
cosmic_hit_table_1_count,
cosmic_hit_table_1_exp,
cosmic_hit_table_1_mut
)
## Warning in rm(cosmic_hit_genes, cosmic_hit_table_1_exp_mean,
## cosmic_hit_table_0_exp_median, : object 'cosmic_hit_table_0_exp' not found
## Warning in rm(cosmic_hit_genes, cosmic_hit_table_1_exp_mean,
## cosmic_hit_table_0_exp_median, : object 'cosmic_hit_table_1_exp' not found
###binning representation in 0.25 increments of positive z-score
other = c()
TSG = c()
Frame = seq(0, 25, by = 0.25)
for (i in Frame) {
temp <- avana_res %>%
filter(Mod_Z_Score > i)
other = c(other, length(temp$Cosmic_TSG[temp$Cosmic_TSG == 0]))
TSG = c(TSG, length(temp$Cosmic_TSG[temp$Cosmic_TSG == 1]))
}
TSG_Comparison <- data_frame(TSG, Frame)
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
TSG_Comparison2 <- data_frame(other, Frame)
try <- merge(TSG_Comparison, TSG_Comparison2)
TSG_Comparison$Type = "COSMIC TSG"
TSG_Comparison2$Type = "Other Genes"
colnames(TSG_Comparison) <- c("Count", "Frame", "Type")
colnames(TSG_Comparison2) <- c("Count", "Frame", "Type")
TSG_Comparison <- rbind(TSG_Comparison, TSG_Comparison2)
a <-
TSG_Comparison %>% filter(Frame < 20) %>%
ggplot( aes(x = as.factor(Frame), y = Count, fill = Type)) +
geom_bar(position = 'fill',
stat = "identity",
width = 1) +
xlab("Shuffled Z-Score)")+
geom_vline(xintercept = "5.25", color = "black") +
scale_y_continuous(labels = scales::percent_format()) +
ylab("Percentage of Representation") +
scale_fill_manual(values = c("#469c83", "#d8dee6")) + #,
# name = c("Gene Representation"),
# guide = guide_legend(direction = "horizontal")) +)
theme_Publication() + theme(legend.position = "none")
a
Uses geom_bar due to plot a similarity and ease.
b <-
TSG_Comparison %>% filter(Frame < 20) %>%
ggplot( aes(x = as.factor(Frame), y = Count, fill = Type)) +
geom_bar(stat = "identity", width = 1) +
#scale_y_continuous(labels = scales::percent_format()) +
xlab("Shuffled Z-Score") +
geom_vline(xintercept = "5.25", color = "black") +
#ylab("Percentage of Representation") +
ylab("Number of Genes") +
scale_fill_manual(
values = c("#469c83", "#d8dee6"),
name = c("Gene Representation"),
guide = guide_legend(direction = "vertical")
) + scale_y_log10() +
theme_Publication() + theme(legend.position = "none") #+theme(legend.title = element_blank())
b
cell_meta <-
cell_line_key %>% dplyr::select(DepMap_ID, `Primary Disease`)
colnames(cell_meta) = c("Cell_Line", "Disease_Type")
avana_res <- merge(avana_res, cell_meta)
head(avana_res)
## Cell_Line entrez_ID GENE MEAN_FC MEAN_FC_Shuffle_Mean SD_Shuffle_Mean n
## 1 ACH-000004 1 A1BG 0.185207 -0.007037 0.249472 8
## 2 ACH-000004 10 NAT2 0.166846 -0.007037 0.249472 8
## 3 ACH-000004 100 ADA -0.062245 -0.007037 0.249472 8
## 4 ACH-000004 1000 CDH2 0.360443 -0.007037 0.249472 8
## 5 ACH-000004 10000 AKT3 0.419141 -0.007037 0.249472 8
## 6 ACH-000004 10001 MED6 -0.962199 -0.007037 0.249472 8
## n_above percentage geneScores Mean_FC_Z_Score GENE_Exp Exp Cosmic_TSG NEG
## 1 4 0.500 0.30632 0.45540 A1BG 4.19298 0 0
## 2 5 0.625 0.09036 0.41178 NAT2 0.00000 0 0
## 3 4 0.500 0.21586 -0.13249 ADA 3.33485 0 0
## 4 8 1.000 0.05554 0.87172 CDH2 0.12433 0 0
## 5 6 0.750 0.06783 1.01118 AKT3 0.66903 0 0
## 6 2 0.250 1.00000 -2.27063 MED6 4.70653 0 0
## Mod_Z_Score ranking mut hit Disease_Type
## 1 0.77060 11566.5 0 0 Leukemia
## 2 0.69700 11035.0 0 0 Leukemia
## 3 -0.22130 5280.0 0 0 Leukemia
## 4 1.47303 15551.0 0 0 Leukemia
## 5 1.70832 16309.0 0 0 Leukemia
## 6 -3.82873 722.0 0 0 Leukemia
cells <-
avana_res %>% dplyr::select(Cell_Line, Disease_Type) %>% unique()
cells_hits <-
avana_res %>% filter(hit == 1) %>% dplyr::select(Cell_Line, Disease_Type) %>% droplevels() %>% unique()
cells$hit = "No Hit"
cells$hit[cells$Cell_Line %in% cells_hits$Cell_Line] = "Hit Found"
cells_tab <-
as.matrix(table(cells$Disease_Type, cells$hit))
#cells %>% ggplot(aes(x = Disease_Type, fill = hit)) + geom_bar(stat = "count", position="stack") + theme_Publication() + theme(axis.text.x = element_text(angle = 90, hjust = 1,size = 10))
par(las = 1, mar = c(18, 2.5, 2, 2))
barplot(
t(cells_tab),
#col=colors()[c(23,89)] ,
col = c("#469c83", "#ffffff"),
border = "black",
font.axis = 2,
#beside=T,
las = 2,
#xlab="group",
font.lab = 1,
cex.names = 1.5,
cex.axis = 1.5
)
#legend("topright",legend = colnames(cells_tab),fill = colors()[c(23,89)],cex = 1.5)
legend(
"topright",
legend = colnames(cells_tab),
fill = c("#469c83", "#ffffff"),
cex = 1.5
)
CCLE_mutations <-
read.csv("Data/CCLE_mutations.csv")
PTEN_CN <-
read.delim("Data/PTEN_CN.txt")
CCLE_mutations <-
CCLE_mutations %>% filter(Hugo_Symbol %in% c("TP53", "PTEN")) %>%
filter(Variant_Classification != "Silent") %>%
dplyr::select(Hugo_Symbol, DepMap_ID) %>%
unique()
temp_cosmic <-
temp_hits %>% filter(Cosmic_TSG == 1)
tab <- table(temp_cosmic$GENE)
tab <- tab %>% sort(decreasing = TRUE)
tab <- tab[tab > 7]
picked_genes <- names(tab)
temp_hits <-
avana_res %>% filter(GENE %in% picked_genes) %>% #filter(Cosmic_TSG == 1) %>%
droplevels() %>% dplyr::select(GENE, Cell_Line, hit) %>% unique() %>% spread(Cell_Line, hit)
#head(temp_hits)
#temp_hits %>%
#temp <- temp_hits[-1][colSums(temp_hits[-1]) > 0]
#temp_hits <- temp_hits %>% dplyr::select(GENE,colnames(temp))
#####Selecting the top genes
rownames(temp_hits) = temp_hits$GENE
temp_hits$GENE = NULL
tab
##
## PTEN TP53 NF2 TSC1 TSC2 KEAP1 CDKN1A CDKN2C EP300 ARNT NF1
## 92 79 52 51 49 33 25 13 13 9 9
## CHEK2 RB1
## 8 8
PTEN_Cells <-
avana_res %>% filter(GENE == "PTEN") %>% filter(hit == 1) %>% pull(Cell_Line) %>% droplevels() %>% as.vector()
TP53_Cells <-
avana_res %>% filter(GENE == "TP53") %>% filter(hit == 1) %>% pull(Cell_Line) %>% droplevels() %>% as.vector()
NF2_Cells <-
avana_res %>% filter(GENE == "NF2") %>% filter(hit == 1) %>% pull(Cell_Line) %>% droplevels() %>% as.vector()
TSC1_Cells <-
avana_res %>% filter(GENE == "TSC1") %>% filter(hit == 1) %>% pull(Cell_Line) %>% droplevels() %>% as.vector()
TSC2_Cells <-
avana_res %>% filter(GENE == "TSC2") %>% filter(hit == 1) %>% pull(Cell_Line) %>% droplevels() %>% as.vector()
KEAP1_Cells <-
avana_res %>% filter(GENE == "KEAP1") %>% filter(hit == 1) %>% pull(Cell_Line) %>% droplevels() %>% as.vector()
CDKN1A_Cells <-
avana_res %>% filter(GENE == "CDKN1A") %>% filter(hit == 1) %>% pull(Cell_Line) %>% droplevels() %>% as.vector()
CDKN2C_Cells <-
avana_res %>% filter(GENE == "CDKN2C") %>% filter(hit == 1) %>% pull(Cell_Line) %>% droplevels() %>% as.vector()
EP300_Cells <-
avana_res %>% filter(GENE == "EP300") %>% filter(hit == 1) %>% pull(Cell_Line) %>% droplevels() %>% as.vector()
ARNT_Cells <-
avana_res %>% filter(GENE == "ARNT") %>% filter(hit == 1) %>% pull(Cell_Line) %>% droplevels() %>% as.vector()
NF1_Cells <-
avana_res %>% filter(GENE == "NF1") %>% filter(hit == 1) %>% pull(Cell_Line) %>% droplevels() %>% as.vector()
CHEK2_Cells <-
avana_res %>% filter(GENE == "CHEK2") %>% filter(hit == 1) %>% pull(Cell_Line) %>% droplevels() %>% as.vector()
RB1_Cells <-
avana_res %>% filter(GENE == "RB1") %>% filter(hit == 1) %>% pull(Cell_Line) %>% droplevels() %>% as.vector()
Other_Cells <-
avana_res %>% #filter(Cell_Line %in% colnames(temp)) %>%
filter(
Cell_Line %!in% c(
PTEN_Cells,
TP53_Cells,
NF2_Cells,
TSC1_Cells,
TSC2_Cells,
KEAP1_Cells,
CDKN1A_Cells,
CDKN2C_Cells,
EP300_Cells,
ARNT_Cells,
NF1_Cells,
CHEK2_Cells,
RB1_Cells
)
) %>%
pull(Cell_Line) %>% droplevels() %>% as.vector() %>% unique()
###Setting order by ascending number of gene hits (TSGs with highest gene count)
temp_hits <-
temp_hits[unique(
c(
PTEN_Cells,
TP53_Cells,
NF2_Cells,
TSC1_Cells,
TSC2_Cells,
KEAP1_Cells,
CDKN1A_Cells,
CDKN2C_Cells,
EP300_Cells,
ARNT_Cells,
NF1_Cells,
CHEK2_Cells,
RB1_Cells,
Other_Cells
)
)]
order = names(tab)
temp_hits <-
temp_hits[match(order, rownames(temp_hits)),]
PTEN_CN_cells = as.vector(integer(length(colnames(temp_hits))))
TP53_mut = as.vector(integer(length(colnames(temp_hits))))
PTEN_mut = as.vector(integer(length(colnames(temp_hits))))
pten_muts <-
CCLE_mutations$DepMap_ID[CCLE_mutations$Hugo_Symbol == "PTEN"]
pten_muts <-
as.vector(pten_muts[pten_muts %in% colnames(temp_hits)])
tp53_muts <-
CCLE_mutations$DepMap_ID[CCLE_mutations$Hugo_Symbol == "TP53"]
#####0.3 represents a good marker of PTEN CN status for this particular dataset. It shouldn't be used as an absolute.
pten_CN_loss <-
as.vector(PTEN_CN$Cell_Line[PTEN_CN$PTEN_CN < 0.3])
tp53_muts <-
as.vector(tp53_muts[tp53_muts %in% colnames(temp_hits)])
pten_CN_loss <-
as.vector(pten_CN_loss[pten_CN_loss %in% colnames(temp_hits)])
TP53_mut[colnames(temp_hits) %in% CCLE_mutations$DepMap_ID[CCLE_mutations$Hugo_Symbol == "TP53"]] = 1
PTEN_CN_cells[colnames(temp_hits) %in% pten_CN_loss] = -1
PTEN_mut[colnames(temp_hits) %in% CCLE_mutations$DepMap_ID[CCLE_mutations$Hugo_Symbol == "PTEN"]] = 1
col_fun = colorRamp2(c(-1, 0, 1), c("#294ef2", "#f2eded", "#de962a"))
#col_fun = colorRamp2(c(-1,0, 1), c("#294ef2","#f2eded","#de962a"))
col = list(foo = col_fun)
###top heatmap annotation for mutations and copy number
ha1 <-
HeatmapAnnotation(
foo = cbind(
TP53_Mutation = TP53_mut,
PTEN_Mutation = PTEN_mut,
PTEN_CN_Loss = PTEN_CN_cells
),
annotation_name_side = "right",
col = col,
show_legend = c("foo" = FALSE),
show_annotation_name = c(foo = TRUE),
# only turn off `bar`
height = unit(1.5, "cm"),
name = "foo_ann"
)
cols = c("#ffffff", "#469c83")
# pdf(
# "./paper_figs_code/fig/2-Mutual_Exclus_Heatmap.pdf",
# width = 10,
# height = 10
# )
Heatmap(
as.matrix(temp_hits),
col = cols,
cluster_rows = FALSE,
cluster_columns = FALSE,
top_annotation = ha1,
name = "foo",
show_heatmap_legend = FALSE #Colv=NA,scale = "none", col = cols,Rowv = NA
)
decorate_heatmap_body("foo", {
grid.rect(gp = gpar(fill = "transparent", col = "black", lwd = 1))
})
rm(list = ls())
if (is.integer(dev.list())) {
dev.off()
}
## null device
## 1